Last updated: 2015-10-24

Code version: 4abacbabb75c62d74de567b85e4503d4985f2061

Objective

Previously, we compared normalized coefficient of variations across individuals. Here, we will also compare the mean gene expression across individuals, using single cell sequencing data and bulk RNA-seq data.

It would be interesting to learn the possible overlap or non-overlap between the genes that we observed significant individual differences in coefficient of variations versus those different in mean gene expression levels across cells.

From here on we will work with the filtered data without NA19098.r2.

Set up

library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
theme_set(theme_bw(base_size = 12))
source("functions.R")

Prepare data

Input annotation of only QC-filtered single cells. Remove NA19098.r2

anno_qc <- read.table("../data/annotation-filter.txt", header = TRUE,
                   stringsAsFactors = FALSE)
is_include <- anno_qc$batch != "NA19098.r2"
anno_qc_filter <- anno_qc[which(is_include), ]

Import endogeneous gene molecule counts that are QC-filtered, CPM-normalized, ERCC-normalized, and also processed to remove unwanted variation from batch effet. ERCC genes are removed from this file.

molecules_ENSG <- read.table("../data/molecules-final.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_ENSG <- molecules_ENSG[ , is_include]

Input moleclule counts before log2 CPM transformation. This file is used to compute percent zero-count cells per sample.

molecules_sparse <- read.table("../data/molecules-filter.txt", header = TRUE, stringsAsFactors = FALSE)

molecules_sparse <- molecules_sparse[grep("ENSG", rownames(molecules_sparse)), ]
stopifnot( all.equal(rownames(molecules_ENSG), rownames(molecules_sparse)) )

Normalize coefficient of variation

Compute coefficient of variation.

# Compute CV and mean of normalized molecule counts (take 2^(log2-normalized count))

molecules_cv_batch_ENSG <- 
  lapply(1:length(unique(anno_qc_filter$batch)), function(per_batch) {
      molecules_per_batch <- 2^molecules_ENSG[ , unique(anno_qc_filter$batch) == unique(anno_qc_filter$batch)[per_batch] ]
      mean_per_gene <- apply(molecules_per_batch, 1, mean, na.rm = TRUE)
      sd_per_gene <- apply(molecules_per_batch, 1, sd, na.rm = TRUE)
      cv_per_gene <- data.frame(mean = mean_per_gene,
                                sd = sd_per_gene,
                                cv = sd_per_gene/mean_per_gene)
      rownames(cv_per_gene) <- rownames(molecules_ENSG)
  
      # cv_per_gene <- cv_per_gene[rowSums(is.na(cv_per_gene)) == 0, ]
      cv_per_gene$batch <- unique(anno_qc_filter$batch)[per_batch]
      
      # Add sparsity percent
      molecules_sparse_per_batch <- molecules_sparse[ , unique(anno_qc_filter$batch) == unique(anno_qc_filter$batch)[per_batch]]
      cv_per_gene$sparse <- rowMeans(as.matrix(molecules_sparse_per_batch) == 0)
        
      return(cv_per_gene)
      }) 
names(molecules_cv_batch_ENSG) <- unique(anno_qc_filter$batch)

Merge summary data.frames.

df_ENSG <- do.call(rbind, molecules_cv_batch_ENSG)

Compute rolling medians across all samples.

library(zoo)
# Compute a data-wide coefficient of variation on CPM normalized counts.
data_cv_ENSG <- apply(2^molecules_ENSG, 1, sd)/apply(2^molecules_ENSG, 1, mean)

# Order of genes by mean expression levels
order_gene <- order(apply(2^molecules_ENSG, 1, mean))

# Rolling medians of log10 squared CV by mean expression levels
roll_medians <- rollapply(log10(data_cv_ENSG^2)[order_gene], width = 50, by = 25,
                         FUN = median, fill = list("extend", "extend", "NA") )
ii_na <- which( is.na(roll_medians) )
roll_medians[ii_na] <- median( log10(data_cv_ENSG^2)[order_gene][ii_na] )

names(roll_medians) <- rownames(molecules_ENSG)[order_gene]


# re-order rolling medians
reorder_gene <- match(rownames(molecules_ENSG), names(roll_medians) )
roll_medians <- roll_medians[ reorder_gene ]

stopifnot( all.equal(names(roll_medians), rownames(molecules_ENSG) ) )

Compute adjusted coefficient of variation.

# adjusted coefficient of variation on log10 scale
log10cv2_adj_ENSG <- 
  lapply(1:length(molecules_cv_batch_ENSG), function(per_batch) {
    foo <- log10(molecules_cv_batch_ENSG[[per_batch]]$cv^2) - roll_medians
    return(foo)
})
df_ENSG$log10cv2_adj_ENSG <- do.call(c, log10cv2_adj_ENSG)

Compare coefficients of variations

library(limma)

df_limma <- matrix(df_ENSG$log10cv2_adj_ENSG, 
                      nrow = nrow(molecules_ENSG), ncol = 8, byrow = FALSE)

design <- data.frame(individual = factor(rep(unique(anno_qc_filter$individual), each = 3) ),
                     rep = factor(rep(c(1:3), times = 3)) )
design <- design[ with(design, !(individual == "NA19098" & rep == "2")), ]

colnames(df_limma) <- with(design, paste0(individual, rep))

fit_limma <- lmFit(df_limma, design = model.matrix( ~ individual, data = design))                      
fit_limma <- eBayes(fit_limma)

False discovery control adjustment.

F.p.adj <- p.adjust(fit_limma$F.p.value, method = "fdr")

Cutoffs

df_cuts <- data.frame(cuts = c(.001, .01, .05, .1, .15, .2))
df_cuts$sig_count <- sapply(1:6, function(per_cut) {
                    sum(F.p.adj < df_cuts$cuts[per_cut] )
                    })
df_cuts
   cuts sig_count
1 0.001        18
2 0.010        42
3 0.050       103
4 0.100       177
5 0.150       328
6 0.200       464

False discovery control adjutment.

F.p.adj <- p.adjust(fit_limma$F.p.value, method = "fdr")

Squared CV vs. mean expression level

We plotted out per gene average CVs across samples versus average expression level for the endogeneous genes, and identified the genes classifed as significantly different between individuals in coefficients of variation.

df_compare <- 
  data.frame(mean = rowMeans( as.matrix( 
                      do.call(cbind, lapply(molecules_cv_batch_ENSG, "[[", 1) ) ) ),
             cv2 = rowMeans( as.matrix( 
                      do.call(cbind, lapply(molecules_cv_batch_ENSG, "[[", 3) ) )^2 ),
             adj_cv2 = rowMeans( 10^as.matrix( 
                      do.call(cbind, log10cv2_adj_ENSG) ) ) )
             
library(broman)
crayon <- brocolors("crayons")
par(mar=c(5,5,3,1))
with(df_compare, plot(x = log10(mean), y = cv2, pch = 1, cex = 1, col = "grey50",
                      lwd = .5,
                      ylab = "average squared coeffcient of variations \n across samples",
                      xlab = "log10 average of mean CPM across samples") )
with(df_compare[F.p.adj < .2, ], points(x = log10(mean), y = cv2, pch = 16, cex = .6,
                                        col = crayon["Tumbleweed"]) )
with(df_compare[F.p.adj < .1, ], points(x = log10(mean), y = cv2, pch = 16, cex = .6,
                                        col = crayon["Orange"]) )
with(df_compare[F.p.adj < .01, ], points(x = log10(mean), y = cv2, pch = 16, cex = .6,
                                        col = crayon["Scarlet"]) )
title(main = "Avg. Squred CV vs. log10(Avg. mean count)")
legend("topright", pch = c(1, 16, 16, 16), 
       legend = c("All genes", "Adj p-value < .2", 
                  "Adj p-value < .1", "Adj p-value < .01"),
       col = c("grey50", crayon[c("Tumbleweed", "Orange", "Scarlet")]),
       bty = "n")

par(mar=c(5,5,3,1))
with(df_compare, plot(x = log10(mean), y = adj_cv2, pch = 1, cex = 1, col = "grey50",
                      lwd = .5,
                      ylab = "average squared coeffcient of variations \n across samples",
                      xlab = "log10 average of mean CPM across samples") )
with(df_compare[F.p.adj < .2, ], points(x = log10(mean), y = adj_cv2, pch = 16, cex = .6,
                                        col = crayon["Tumbleweed"]) )
with(df_compare[F.p.adj < .1, ], points(x = log10(mean), y = adj_cv2, pch = 16, cex = .6,
                                        col = crayon["Orange"]) )
with(df_compare[F.p.adj < .01, ], points(x = log10(mean), y = adj_cv2, pch = 16, cex = .6,
                                        col = crayon["Scarlet"]) )
title(main = "Avg. Adjusted Squred CV vs. log10(Avg. mean count)")
legend("topright", pch = c(1, 16, 16, 16), 
       legend = c("All genes", "Adj p-value < .2", 
                  "Adj p-value < .1", "Adj p-value < .01"),
       col = c("grey50", crayon[c("Tumbleweed", "Orange", "Scarlet")]),
       bty = "n")

Investigate…

Our naive analysis seems to pick up genes with different density between individuals.

library(gridExtra)
order_low_cv <- order(df_compare[F.p.adj < .01, ]$cv)
low_plots <- lapply(1:6, function(i) {
  ind <- order_low_cv[i]
  ggplot(data.frame(values = unlist(molecules_ENSG[ind, ]),
                    individual = factor(anno_qc_filter$individual),
                    replicate = factor(anno_qc_filter$replicate),
                    batch = factor(anno_qc_filter$batch), check.rows = F), 
         aes(x = values)) + 
    geom_density(aes(group = batch, col = individual)) + 
    ggtitle(paste(rownames(molecules_ENSG)[ind],
                  ";CV = ", round(df_compare[F.p.adj < .01, ]$cv[ind], 2) ) )
})
grid.arrange(grobs = low_plots, ncol = 2)

order_high_cv <- order(df_compare[F.p.adj < .01, ]$cv, decreasing = TRUE)
high_plots <- lapply(1:6, function(i) {
  ind <- order_high_cv[i]
  ggplot(data.frame(values = unlist(molecules_ENSG[ind, ]),
                    individual = factor(anno_qc_filter$individual),
                    replicate = factor(anno_qc_filter$replicate),
                    batch = factor(anno_qc_filter$batch), check.rows = F), 
         aes(x = values)) + 
    geom_density(aes(group = batch, col = individual)) + 
    ggtitle(paste(rownames(molecules_ENSG)[ind],
                  ";CV = ", round(df_compare[F.p.adj < .01, ]$cv[ind], 2) ) )
})
grid.arrange(grobs = high_plots, ncol = 2)

Annotation

ENSEMBLE database

sig_cv <- order(F.p.adj)[F.p.adj < .05]
library("biomaRt")
ensembl <- useMart(host = "grch37.ensembl.org",
                   biomart = "ENSEMBL_MART_ENSEMBL",
                   dataset = "hsapiens_gene_ensembl")

differential_CV_genes_info <- getBM(attributes = c("ensembl_gene_id", "chromosome_name",
                                                   "external_gene_name", "transcript_count",
                                                   "description"),
                                    filters = "ensembl_gene_id",
                                    values = rownames(molecules_ENSG[sig_cv, ]),
                                    mart = ensembl)
differential_CV_genes_info
    ensembl_gene_id chromosome_name external_gene_name transcript_count
1   ENSG00000013288               4             MAN2B2                5
2   ENSG00000013375               6               PGM3               14
3   ENSG00000036549               1               ZZZ3               11
4   ENSG00000048471              16              SNX29               10
5   ENSG00000068400               X            GRIPAP1                9
6   ENSG00000070669               7               ASNS               17
7   ENSG00000073910              13                FRY                7
8   ENSG00000074590              12              NUAK1                4
9   ENSG00000075240              22             GRAMD4                7
10  ENSG00000079819               6            EPB41L2               31
11  ENSG00000089195              20              TRMT6                5
12  ENSG00000092820               6                EZR                4
13  ENSG00000100564              14               PIGH               12
14  ENSG00000100968              14             NFATC4               33
15  ENSG00000101945               X            SUV39H1                5
16  ENSG00000102931              16             ARL2BP                4
17  ENSG00000103042              16            SLC38A7               16
18  ENSG00000103363              16              TCEB2                5
19  ENSG00000104872              19             PIH1D1               24
20  ENSG00000105229              19              PIAS4                6
21  ENSG00000106366               7           SERPINE1                2
22  ENSG00000106554               7             CHCHD3                9
23  ENSG00000108091              10              CCDC6                3
24  ENSG00000108479              17              GALK1               10
25  ENSG00000108799              17               EZH1               25
26  ENSG00000108840              17              HDAC5               13
27  ENSG00000110756              11               HPS5               11
28  ENSG00000112245               6             PTP4A1                5
29  ENSG00000112339               6              HBS1L               18
30  ENSG00000115808               2               STRN                3
31  ENSG00000115816               2              CEBPZ                3
32  ENSG00000118217               1               ATF6                2
33  ENSG00000124067              16            SLC12A4               20
34  ENSG00000125841              20              NRSN2                9
35  ENSG00000127952               7             STYXL1               11
36  ENSG00000130208              19              APOC1               10
37  ENSG00000131653              16              TRAF7                7
38  ENSG00000133424              22              LARGE               19
39  ENSG00000135063               9           FAM189A2                6
40  ENSG00000135250               7              SRPK2               15
41  ENSG00000135315               6           KIAA1009                6
42  ENSG00000136371              15              MTHFS                4
43  ENSG00000137413               6               TAF8               10
44  ENSG00000137710              11                RDX               18
45  ENSG00000138750               4              NUP54               20
46  ENSG00000139974              14            SLC38A6               21
47  ENSG00000140905              16               GCSH                6
48  ENSG00000142698               1            C1orf94                2
49  ENSG00000143622               1               RIT1                6
50  ENSG00000144120               2            TMEM177                6
51  ENSG00000145388               4            METTL14                4
52  ENSG00000149054              11             ZNF215                6
53  ENSG00000149179              11           C11orf49               24
54  ENSG00000149806              11                FAU                9
55  ENSG00000150459              13              SAP18                7
56  ENSG00000152359               5               POC5               15
57  ENSG00000153044               5              CENPH                6
58  ENSG00000153250               2              RBMS1               15
59  ENSG00000156011               8               PSD3               18
60  ENSG00000156531               X               PHF6                6
61  ENSG00000158545              16             ZC3H18               12
62  ENSG00000161021               5              MAML1                4
63  ENSG00000162458               1             FBLIM1               16
64  ENSG00000163714               3             U2SURP               18
65  ENSG00000163950               4               SLBP                6
66  ENSG00000164074               4            C4orf29                5
67  ENSG00000165186               X             PTCHD1                2
68  ENSG00000166822              16           TMEM170A                7
69  ENSG00000168038               3               ULK4               12
70  ENSG00000168610              17              STAT3               14
71  ENSG00000170471              20            RALGAPB               10
72  ENSG00000171466              19             ZNF562               12
73  ENSG00000171603               1             CLSTN1                6
74  ENSG00000171848               2               RRM2               10
75  ENSG00000172113               3               NME6               21
76  ENSG00000172534               X              HCFC1                5
77  ENSG00000172831              16               CES2               12
78  ENSG00000173486              11              FKBP2                6
79  ENSG00000174238              17             PITPNA               11
80  ENSG00000174744              11              BRMS1                9
81  ENSG00000179195              12             ZNF664               14
82  ENSG00000180398               2              MCFD2               17
83  ENSG00000181038              17            METTL23               14
84  ENSG00000183150              12              GPR19                5
85  ENSG00000183684              17             ALYREF                5
86  ENSG00000183889              16             NPIPA7               13
87  ENSG00000185043              15               CIB1                1
88  ENSG00000185475              11           TMEM179B                5
89  ENSG00000185650              14            ZFP36L1                7
90  ENSG00000187664              19             HAPLN4                2
91  ENSG00000188636              22             LDOC1L                1
92  ENSG00000188643               1            S100A16                5
93  ENSG00000196267              19             ZNF836                6
94  ENSG00000197451               5            HNRNPAB                8
95  ENSG00000198513              14               ATL1               12
96  ENSG00000198633              19             ZNF534                4
97  ENSG00000204442              13            FAM155A                1
98  ENSG00000204469               6             PRRC2A               14
99  ENSG00000213903              14              LTB4R                5
100 ENSG00000215114               8             UBXN2B                5
101 ENSG00000219545               7           RPA3-AS1                7
102 ENSG00000257093               7           KIAA1147                2
103 ENSG00000270188               1          MTRNR2L11                1
                                                                                                                    description
1                                                         mannosidase, alpha, class 2B, member 2 [Source:HGNC Symbol;Acc:29623]
2                                                                            phosphoglucomutase 3 [Source:HGNC Symbol;Acc:8907]
3                                                              zinc finger, ZZ-type containing 3 [Source:HGNC Symbol;Acc:24523]
4                                                                               sorting nexin 29 [Source:HGNC Symbol;Acc:30542]
5                                                                     GRIP1 associated protein 1 [Source:HGNC Symbol;Acc:18706]
6                                                    asparagine synthetase (glutamine-hydrolyzing) [Source:HGNC Symbol;Acc:753]
7                                                                     furry homolog (Drosophila) [Source:HGNC Symbol;Acc:20367]
8                                                               NUAK family, SNF1-like kinase, 1 [Source:HGNC Symbol;Acc:14311]
9                                                                       GRAM domain containing 4 [Source:HGNC Symbol;Acc:29113]
10                                                   erythrocyte membrane protein band 4.1-like 2 [Source:HGNC Symbol;Acc:3379]
11                                              tRNA methyltransferase 6 homolog (S. cerevisiae) [Source:HGNC Symbol;Acc:20900]
12                                                                                         ezrin [Source:HGNC Symbol;Acc:12691]
13                                       phosphatidylinositol glycan anchor biosynthesis, class H [Source:HGNC Symbol;Acc:8964]
14                      nuclear factor of activated T-cells, cytoplasmic, calcineurin-dependent 4 [Source:HGNC Symbol;Acc:7778]
15                                          suppressor of variegation 3-9 homolog 1 (Drosophila) [Source:HGNC Symbol;Acc:11479]
16                                                ADP-ribosylation factor-like 2 binding protein [Source:HGNC Symbol;Acc:17146]
17                                                            solute carrier family 38, member 7 [Source:HGNC Symbol;Acc:25582]
18                    transcription elongation factor B (SIII), polypeptide 2 (18kDa, elongin B) [Source:HGNC Symbol;Acc:11619]
19                                                                      PIH1 domain containing 1 [Source:HGNC Symbol;Acc:26075]
20                                                        protein inhibitor of activated STAT, 4 [Source:HGNC Symbol;Acc:17002]
21  serpin peptidase inhibitor, clade E (nexin, plasminogen activator inhibitor type 1), member 1 [Source:HGNC Symbol;Acc:8583]
22                                       coiled-coil-helix-coiled-coil-helix domain containing 3 [Source:HGNC Symbol;Acc:21906]
23                                                               coiled-coil domain containing 6 [Source:HGNC Symbol;Acc:18782]
24                                                                                galactokinase 1 [Source:HGNC Symbol;Acc:4118]
25                                                       enhancer of zeste homolog 1 (Drosophila) [Source:HGNC Symbol;Acc:3526]
26                                                                         histone deacetylase 5 [Source:HGNC Symbol;Acc:14068]
27                                                                   Hermansky-Pudlak syndrome 5 [Source:HGNC Symbol;Acc:17022]
28                                                protein tyrosine phosphatase type IVA, member 1 [Source:HGNC Symbol;Acc:9634]
29                                                                      HBS1-like (S. cerevisiae) [Source:HGNC Symbol;Acc:4834]
30                                                          striatin, calmodulin binding protein [Source:HGNC Symbol;Acc:11424]
31                                                  CCAAT/enhancer binding protein (C/EBP), zeta [Source:HGNC Symbol;Acc:24218]
32                                                               activating transcription factor 6 [Source:HGNC Symbol;Acc:791]
33                           solute carrier family 12 (potassium/chloride transporter), member 4 [Source:HGNC Symbol;Acc:10913]
34                                                                                   neurensin 2 [Source:HGNC Symbol;Acc:16229]
35                                                  serine/threonine/tyrosine interacting-like 1 [Source:HGNC Symbol;Acc:18165]
36                                                                              apolipoprotein C-I [Source:HGNC Symbol;Acc:607]
37                                 TNF receptor-associated factor 7, E3 ubiquitin protein ligase [Source:HGNC Symbol;Acc:20456]
38                                                                       like-glycosyltransferase [Source:HGNC Symbol;Acc:6511]
39                                                family with sequence similarity 189, member A2 [Source:HGNC Symbol;Acc:24820]
40                                                                         SRSF protein kinase 2 [Source:HGNC Symbol;Acc:11306]
41                                                                                      KIAA1009 [Source:HGNC Symbol;Acc:21107]
42               5,10-methenyltetrahydrofolate synthetase (5-formyltetrahydrofolate cyclo-ligase) [Source:HGNC Symbol;Acc:7437]
43               TAF8 RNA polymerase II, TATA box binding protein (TBP)-associated factor, 43kDa [Source:HGNC Symbol;Acc:17300]
44                                                                                        radixin [Source:HGNC Symbol;Acc:9944]
45                                                                             nucleoporin 54kDa [Source:HGNC Symbol;Acc:17359]
46                                                            solute carrier family 38, member 6 [Source:HGNC Symbol;Acc:19863]
47                                        glycine cleavage system protein H (aminomethyl carrier) [Source:HGNC Symbol;Acc:4208]
48                                                            chromosome 1 open reading frame 94 [Source:HGNC Symbol;Acc:28250]
49                                                                       Ras-like without CAAX 1 [Source:HGNC Symbol;Acc:10023]
50                                                                     transmembrane protein 177 [Source:HGNC Symbol;Acc:28143]
51                                                                     methyltransferase like 14 [Source:HGNC Symbol;Acc:29330]
52                                                                       zinc finger protein 215 [Source:HGNC Symbol;Acc:13007]
53                                                           chromosome 11 open reading frame 49 [Source:HGNC Symbol;Acc:28720]
54                    Finkel-Biskis-Reilly murine sarcoma virus (FBR-MuSV) ubiquitously expressed [Source:HGNC Symbol;Acc:3597]
55                                                               Sin3A-associated protein, 18kDa [Source:HGNC Symbol;Acc:10530]
56                                                                       POC5 centriolar protein [Source:HGNC Symbol;Acc:26658]
57                                                                          centromere protein H [Source:HGNC Symbol;Acc:17268]
58                                       RNA binding motif, single stranded interacting protein 1 [Source:HGNC Symbol;Acc:9907]
59                                                       pleckstrin and Sec7 domain containing 3 [Source:HGNC Symbol;Acc:19093]
60                                                                          PHD finger protein 6 [Source:HGNC Symbol;Acc:18145]
61                                                           zinc finger CCCH-type containing 18 [Source:HGNC Symbol;Acc:25091]
62                                                                mastermind-like 1 (Drosophila) [Source:HGNC Symbol;Acc:13632]
63                                                                 filamin binding LIM protein 1 [Source:HGNC Symbol;Acc:24686]
64                                                    U2 snRNP-associated SURP domain containing [Source:HGNC Symbol;Acc:30855]
65                                                                     stem-loop binding protein [Source:HGNC Symbol;Acc:10904]
66                                                            chromosome 4 open reading frame 29 [Source:HGNC Symbol;Acc:26111]
67                                                                   patched domain containing 1 [Source:HGNC Symbol;Acc:26392]
68                                                                    transmembrane protein 170A [Source:HGNC Symbol;Acc:29577]
69                                                                          unc-51 like kinase 4 [Source:HGNC Symbol;Acc:15784]
70              signal transducer and activator of transcription 3 (acute-phase response factor) [Source:HGNC Symbol;Acc:11364]
71                                   Ral GTPase activating protein, beta subunit (non-catalytic) [Source:HGNC Symbol;Acc:29221]
72                                                                       zinc finger protein 562 [Source:HGNC Symbol;Acc:25950]
73                                                                                 calsyntenin 1 [Source:HGNC Symbol;Acc:17447]
74                                                                   ribonucleotide reductase M2 [Source:HGNC Symbol;Acc:10452]
75                                                      NME/NM23 nucleoside diphosphate kinase 6 [Source:HGNC Symbol;Acc:20567]
76                                                   host cell factor C1 (VP16-accessory protein) [Source:HGNC Symbol;Acc:4839]
77                                                                             carboxylesterase 2 [Source:HGNC Symbol;Acc:1864]
78                                                                 FK506 binding protein 2, 13kDa [Source:HGNC Symbol;Acc:3718]
79                                                   phosphatidylinositol transfer protein, alpha [Source:HGNC Symbol;Acc:9001]
80                                                         breast cancer metastasis suppressor 1 [Source:HGNC Symbol;Acc:17262]
81                                                                       zinc finger protein 664 [Source:HGNC Symbol;Acc:25406]
82                                                      multiple coagulation factor deficiency 2 [Source:HGNC Symbol;Acc:18451]
83                                                                     methyltransferase like 23 [Source:HGNC Symbol;Acc:26988]
84                                                                  G protein-coupled receptor 19 [Source:HGNC Symbol;Acc:4473]
85                                                                         Aly/REF export factor [Source:HGNC Symbol;Acc:19071]
86                                                                         Protein PKD1P1  [Source:UniProtKB/TrEMBL;Acc:H0YEP2]
87                                                     calcium and integrin binding 1 (calmyrin) [Source:HGNC Symbol;Acc:16920]
88                                                                    transmembrane protein 179B [Source:HGNC Symbol;Acc:33744]
89                                                               ZFP36 ring finger protein-like 1 [Source:HGNC Symbol;Acc:1107]
90                                                    hyaluronan and proteoglycan link protein 4 [Source:HGNC Symbol;Acc:31357]
91                                               leucine zipper, down-regulated in cancer 1-like [Source:HGNC Symbol;Acc:13343]
92                                                              S100 calcium binding protein A16 [Source:HGNC Symbol;Acc:20441]
93                                                                       zinc finger protein 836 [Source:HGNC Symbol;Acc:34333]
94                                                    heterogeneous nuclear ribonucleoprotein A/B [Source:HGNC Symbol;Acc:5034]
95                                                                             atlastin GTPase 1 [Source:HGNC Symbol;Acc:11231]
96                                                                       zinc finger protein 534 [Source:HGNC Symbol;Acc:26337]
97                                                 family with sequence similarity 155, member A [Source:HGNC Symbol;Acc:33877]
98                                                                   proline-rich coiled-coil 2A [Source:HGNC Symbol;Acc:13918]
99                                                                        leukotriene B4 receptor [Source:HGNC Symbol;Acc:6713]
100                                                                        UBX domain protein 2B [Source:HGNC Symbol;Acc:27035]
101                                                                         RPA3 antisense RNA 1 [Source:HGNC Symbol;Acc:48955]
102                                                                                     KIAA1147 [Source:HGNC Symbol;Acc:29472]
103                                                                 MT-RNR2-like 11 (pseudogene) [Source:HGNC Symbol;Acc:37168]

Protein-protein interaction

Export significant genes.

write.table(rownames(molecules_ENSG[F.p.adj < .05, ]), 
            file = "../data/genelist-cv-sig.txt", 
            row.names = FALSE, col.names = FALSE, quote = FALSE)

Use string analysis data base to look for protein protein interaction of these differential CV genes.

This is the confidence view. Stronger associations are represented by thicker lines. This is the confidence view

This is the evidence view. Different line colors represent the types of evidence for the association. This is the evidence view

GO analysis

[Description and steps are copied from my work with Sidney]

GOstats performed Hypergeometric test for gene set enrichment. The test can take account into the conditional relationship between GO terms. Standard GOstats output contains the following information:

  1. GPID: GO term ID number
  2. Pvalue: p-vlaue of the Hypergeometric test for over-representation
  3. OddsRatio: effect size
  4. ExpCount: expected number of genes
  5. Count: observed number of genes
  6. Size: number of possible genes given the defined gene universe

GO analysis workflow is consisted of two major steps. First, we identified GO terms significant enriched given a p-value cutoff, which we set to 1 so that all possible GO terms associated with the genes can be included. We do this to eliminate false negatives. Then, we take the union of the GO terms across functional categories of interest (e.g., translation attentuation and post-translational buffering). The GO terms that are significant in at least one of the four functional categories are presented in the results.

Note. The following heatmaps are generated based on the p-values of the gene sets.

if (file.exists("rda/cv-annotation/go-cv.rda")) {
  load("rda/cv-annotation/go-cv.rda")  
} else {
go_sig_cv <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
              my_ensembl_gene_test = rownames(molecules_ENSG)[F.p.adj < .05],
              pval_cutoff = 1, ontology=c("BP","CC","MF") )
save(molecules_ENSG,
     F.p.adj, go_sig_cv, file = "rda/cv-annotation/go-cv.rda")
}

Extract terms

if (file.exists("rda/cv-annotation/go-cv-terms.rda")) {
  load("rda/cv-annotation/go-cv-terms.rda")  
} else {
  # Biological process
  goterms_bp <- summary(go_sig_cv$GO$BP, pvalue = 1)
  goterms_bp <- data.frame(ID = goterms_bp[[1]],
                           Pvalue = goterms_bp[[2]],
                           Terms = goterms_bp[[7]])
  goterms_bp <- goterms_bp[order(goterms_bp$Pvalue), ]
  
  # Cellular component
  goterms_cc <- summary(go_sig_cv$GO$CC, pvalue = 1)
  goterms_cc <- data.frame(ID = goterms_cc[[1]],
                           Pvalue = goterms_cc[[2]],
                           Terms = goterms_cc[[7]])
  goterms_cc <- goterms_cc[order(goterms_cc$Pvalue), ]

  # Molecular function
  goterms_mf <- summary(go_sig_cv$GO$MF, pvalue = 1)
  goterms_mf <- data.frame(ID = goterms_mf[[1]],
                           Pvalue = goterms_mf[[2]],
                           Terms = goterms_mf[[7]])
  goterms_mf <- goterms_mf[order(goterms_mf$Pvalue), ]

  save(goterms_bp, goterms_cc, goterms_mf, 
       file = "rda/cv-annotation/go-cv-terms.rda")
}

*Biological process

dim(goterms_bp)
[1] 1899    3
kable(goterms_bp[1:50, ])
ID Pvalue Terms
GO:0071294 0.0000000 cellular response to zinc ion
GO:0010043 0.0000000 response to zinc ion
GO:0071276 0.0000000 cellular response to cadmium ion
GO:0046686 0.0000043 response to cadmium ion
GO:0071248 0.0000050 cellular response to metal ion
GO:1990267 0.0000079 response to transition metal nanoparticle
GO:0071241 0.0000120 cellular response to inorganic substance
GO:0045926 0.0000139 negative regulation of growth
GO:0010950 0.0001659 positive regulation of endopeptidase activity
GO:0010952 0.0003076 positive regulation of peptidase activity
GO:0036017 0.0004702 response to erythropoietin
GO:0036018 0.0004702 cellular response to erythropoietin
GO:0045091 0.0004702 regulation of single stranded viral RNA replication via double stranded DNA intermediate
GO:0043280 0.0007251 positive regulation of cysteine-type endopeptidase activity involved in apoptotic process
GO:0040008 0.0008488 regulation of growth
GO:2001056 0.0009056 positive regulation of cysteine-type endopeptidase activity
GO:0039692 0.0011619 single stranded viral RNA replication via double stranded DNA intermediate
GO:0071371 0.0011619 cellular response to gonadotropin stimulus
GO:0009611 0.0012159 response to wounding
GO:0045862 0.0012961 positive regulation of proteolysis
GO:0070979 0.0013827 protein K11-linked ubiquitination
GO:0052548 0.0013872 regulation of endopeptidase activity
GO:0070316 0.0016172 regulation of G0 to G1 transition
GO:0010038 0.0017304 response to metal ion
GO:0001101 0.0019230 response to acid chemical
GO:0052547 0.0019253 regulation of peptidase activity
GO:0045023 0.0021437 G0 to G1 transition
GO:0071345 0.0033358 cellular response to cytokine stimulus
GO:0034698 0.0034054 response to gonadotropin
GO:0035456 0.0034054 response to interferon-beta
GO:0039694 0.0034054 viral RNA genome replication
GO:0039703 0.0034054 RNA replication
GO:0006978 0.0041380 DNA damage response, signal transduction by p53 class mediator resulting in transcription of p21 class mediator
GO:0043065 0.0044896 positive regulation of apoptotic process
GO:0043068 0.0047411 positive regulation of programmed cell death
GO:0042772 0.0049369 DNA damage response, signal transduction resulting in transcription
GO:0046688 0.0049369 response to copper ion
GO:0071229 0.0054035 cellular response to acid chemical
GO:0048519 0.0056520 negative regulation of biological process
GO:0060337 0.0057822 type I interferon signaling pathway
GO:0071357 0.0057822 cellular response to type I interferon
GO:0072210 0.0058007 metanephric nephron development
GO:0010942 0.0059608 positive regulation of cell death
GO:0044707 0.0060654 single-multicellular organism process
GO:0034340 0.0061873 response to type I interferon
GO:1903900 0.0065011 regulation of viral life cycle
GO:0045069 0.0066088 regulation of viral genome replication
GO:0043281 0.0067085 regulation of cysteine-type endopeptidase activity involved in apoptotic process
GO:0001975 0.0067283 response to amphetamine
GO:0040007 0.0068687 growth

*Cellular component

dim(goterms_cc)
[1] 230   3
kable(goterms_cc[1:50, ])
ID Pvalue Terms
GO:0000152 0.0029964 nuclear ubiquitin ligase complex
GO:0032797 0.0045608 SMN complex
GO:0097504 0.0045608 Gemini of coiled bodies
GO:0031519 0.0053634 PcG protein complex
GO:0019035 0.0094132 viral integration complex
GO:0035098 0.0096514 ESC/E(Z) complex
GO:0034719 0.0108718 SMN-Sm protein complex
GO:0005680 0.0149138 anaphase-promoting complex
GO:0000151 0.0177601 ubiquitin ligase complex
GO:0008043 0.0187388 intracellular ferritin complex
GO:0009330 0.0187388 DNA topoisomerase complex (ATP-hydrolyzing)
GO:0070288 0.0187388 ferritin complex
GO:0001740 0.0279776 Barr body
GO:0097440 0.0279776 apical dendrite
GO:0043204 0.0341473 perikaryon
GO:0061200 0.0371304 clathrin-sculpted gamma-aminobutyric acid transport vesicle
GO:0061202 0.0371304 clathrin-sculpted gamma-aminobutyric acid transport vesicle membrane
GO:0072562 0.0448923 blood microparticle
GO:0000805 0.0461979 X chromosome
GO:0060198 0.0461979 clathrin-sculpted vesicle
GO:0000276 0.0551810 mitochondrial proton-transporting ATP synthase complex, coupling factor F(o)
GO:0000974 0.0551810 Prp19 complex
GO:0030870 0.0551810 Mre11 complex
GO:0031461 0.0556633 cullin-RING ubiquitin ligase complex
GO:0005681 0.0571076 spliceosomal complex
GO:0035102 0.0640804 PRC1 complex
GO:0015030 0.0642553 Cajal body
GO:0048471 0.0781674 perinuclear region of cytoplasm
GO:0000790 0.0868045 nuclear chromatin
GO:0045263 0.0988565 proton-transporting ATP synthase complex, coupling factor F(o)
GO:0031011 0.1073490 Ino80 complex
GO:0033202 0.1073490 DNA helicase complex
GO:0043025 0.1100471 neuronal cell body
GO:0030018 0.1123741 Z disc
GO:0008076 0.1157623 voltage-gated potassium channel complex
GO:0005912 0.1219669 adherens junction
GO:0034705 0.1240971 potassium channel complex
GO:0035097 0.1247547 histone methyltransferase complex
GO:0070161 0.1327431 anchoring junction
GO:0031674 0.1342464 I band
GO:0000785 0.1405080 chromatin
GO:0031093 0.1486384 platelet alpha granule lumen
GO:0033177 0.1486384 proton-transporting two-sector ATPase complex, proton-transporting domain
GO:0035861 0.1486384 site of double-strand break
GO:0034774 0.1566668 secretory granule lumen
GO:0000228 0.1581837 nuclear chromosome
GO:0097346 0.1646203 INO80-type complex
GO:0044297 0.1691931 cell body
GO:1902494 0.1702408 catalytic complex
GO:0005925 0.1704578 focal adhesion

*Molecular function

dim(goterms_mf)
[1] 285   3
kable(goterms_mf[1:50, ])
ID Pvalue Terms
GO:0005520 0.0041803 insulin-like growth factor binding
GO:0004556 0.0082278 alpha-amylase activity
GO:0004595 0.0082278 pantetheine-phosphate adenylyltransferase activity
GO:0004949 0.0082278 cannabinoid receptor activity
GO:0016160 0.0082278 amylase activity
GO:0008270 0.0089991 zinc ion binding
GO:0061631 0.0093864 ubiquitin conjugating enzyme activity
GO:0061650 0.0104352 ubiquitin-like protein conjugating enzyme activity
GO:0046914 0.0112368 transition metal ion binding
GO:0008201 0.0119451 heparin binding
GO:0005178 0.0130937 integrin binding
GO:0003883 0.0163887 CTP synthase activity
GO:0004140 0.0163887 dephospho-CoA kinase activity
GO:0016900 0.0163887 oxidoreductase activity, acting on the CH-OH group of donors, disulfide as acceptor
GO:0032090 0.0163887 Pyrin domain binding
GO:0047057 0.0163887 vitamin-K-epoxide reductase (warfarin-sensitive) activity
GO:1990254 0.0163887 keratin filament binding
GO:0005539 0.0220635 glycosaminoglycan binding
GO:0004322 0.0244834 ferroxidase activity
GO:0005251 0.0244834 delayed rectifier potassium channel activity
GO:0008199 0.0244834 ferric iron binding
GO:0016724 0.0244834 oxidoreductase activity, oxidizing metal ions, oxygen as acceptor
GO:1990050 0.0244834 phosphatidic acid transporter activity
GO:0000774 0.0325124 adenyl-nucleotide exchange factor activity
GO:0003918 0.0325124 DNA topoisomerase type II (ATP-hydrolyzing) activity
GO:0005212 0.0325124 structural constituent of eye lens
GO:0061505 0.0325124 DNA topoisomerase II activity
GO:0003696 0.0404761 satellite DNA binding
GO:0008494 0.0404761 translation activator activity
GO:0055131 0.0404761 C3HC4-type RING finger domain binding
GO:0003916 0.0483751 DNA topoisomerase activity
GO:0019215 0.0483751 intermediate filament binding
GO:0030368 0.0483751 interleukin-17 receptor activity
GO:0070087 0.0483751 chromo shadow domain binding
GO:0008083 0.0485652 growth factor activity
GO:0050839 0.0540124 cell adhesion molecule binding
GO:0044877 0.0550545 macromolecular complex binding
GO:0019237 0.0562100 centromeric DNA binding
GO:1901681 0.0605070 sulfur compound binding
GO:0008144 0.0635227 drug binding
GO:0004862 0.0639812 cAMP-dependent protein kinase inhibitor activity
GO:0008139 0.0639812 nuclear localization sequence binding
GO:0042826 0.0680492 histone deacetylase binding
GO:0008301 0.0716892 DNA binding, bending
GO:0090079 0.0716892 translation regulator activity, nucleic acid binding
GO:0048038 0.0793346 quinone binding
GO:0051787 0.0793346 misfolded protein binding
GO:0061630 0.0846794 ubiquitin protein ligase activity
GO:0008656 0.0869178 cysteine-type endopeptidase activator activity involved in apoptotic process
GO:0016722 0.0869178 oxidoreductase activity, oxidizing metal ions

Session information

sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.4 (Yosemite)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] biomaRt_2.24.1   gridExtra_2.0.0  broman_0.59-5    zoo_1.7-12      
 [5] ggplot2_1.0.1    edgeR_3.10.5     limma_3.24.15    dplyr_0.4.3     
 [9] data.table_1.9.6 knitr_1.11      

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.1          highr_0.5.1          GenomeInfoDb_1.4.3  
 [4] formatR_1.2.1        plyr_1.8.3           bitops_1.0-6        
 [7] tools_3.2.1          digest_0.6.8         RSQLite_1.0.0       
[10] evaluate_0.8         gtable_0.1.2         lattice_0.20-33     
[13] DBI_0.3.1            yaml_2.1.13          parallel_3.2.1      
[16] proto_0.3-10         stringr_1.0.0        IRanges_2.2.9       
[19] S4Vectors_0.6.6      stats4_3.2.1         Biobase_2.28.0      
[22] R6_2.1.1             AnnotationDbi_1.30.1 XML_3.98-1.3        
[25] rmarkdown_0.8.1      reshape2_1.4.1       magrittr_1.5        
[28] BiocGenerics_0.14.0  scales_0.3.0         htmltools_0.2.6     
[31] MASS_7.3-44          assertthat_0.1       colorspace_1.2-6    
[34] labeling_0.3         stringi_0.5-5        RCurl_1.95-4.7      
[37] munsell_0.4.2        chron_2.3-47